Data Visualization and Statistics¶
Libraries and Configurations¶
Import configuration files
In [ ]:
from configparser import ConfigParser
# Load the project settings from the INI file located one directory above
# the current working directory.
config = ConfigParser()
# ConfigParser.read() returns the list of files it parsed successfully and
# silently skips files it cannot find, so an empty list echoed by this cell
# means the configuration was NOT loaded.
config.read("../config.ini")
Out[ ]:
['../config.ini']
Import data libraries
In [ ]:
import pandas as pd
# Import label encoder
from sklearn import preprocessing
Import other libraries
In [ ]:
from rich.progress import Progress
from rich import traceback
# Route uncaught exceptions through rich's traceback renderer. install()
# returns the handler it replaced; being the last expression of the cell,
# that return value is what gets echoed in the cell output.
traceback.install()
Out[ ]:
<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x105bb9390>>
Custom helper scripts
In [ ]:
# Step up to the parent directory so that `scripts` can be imported as a
# package from there, then step back down afterwards.
# NOTE(review): this assumes the kernel starts inside
# "data_exploration_cleaning"; the directory name is hard-coded below.
%cd ..
from scripts import plotHelper, encodingHelper
# Return to the notebook's own directory.
%cd data_exploration_cleaning
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/.venv/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library. self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/data_exploration_cleaning
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/.venv/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library. self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
Import Data¶
In [ ]:
import os

# Path of the balanced, encoded combined dataframe CSV inside the interim data
# directory named by the "interim_path" setting.
# os.path.join adds a separator only when "interim_path" does not already end
# with one, so the path is valid with or without a trailing slash in the
# config file (plain string concatenation required the trailing slash).
combined_df_csv = os.path.join(
    config["DEFAULT"]["interim_path"], "combined_df_balanced_encoded.csv"
)
In [ ]:
# Load the CSV into a dataframe; index_col=0 makes the first CSV column the
# row index instead of a regular data column.
combined_df = pd.read_csv(
    filepath_or_buffer=combined_df_csv,
    index_col=0,
)
Fixing column data types
In [ ]:
# Show the dtype pandas inferred for each column right after loading, before
# any explicit conversion is applied.
combined_df.dtypes
Out[ ]:
Timestamp object MAC Address object Channel int64 DS Channel float64 HT Capabilities int64 Extended Capabilities int64 Vendor Specific Tags int64 SSID object Supported Rates int64 Extended Supported Rates int64 VHT Capabilities int64 HE Capabilities int64 Length int64 Label object dtype: object
In [ ]:
# Parse the capture time into a pandas datetime column.
combined_df["Timestamp"] = pd.to_datetime(combined_df["Timestamp"])

# Columns that are handled as text from here on. Several of them are loaded
# as int64 codes; they are cast to str here, presumably so that downstream
# plots treat the codes as categories rather than numbers (confirm against
# plotHelper). "Channel", "DS Channel" and "Length" are left numeric.
string_columns = [
    "Label",
    "SSID",
    "MAC Address",
    "HT Capabilities",
    "Extended Capabilities",
    "Vendor Specific Tags",
    "Supported Rates",
    "Extended Supported Rates",
    "VHT Capabilities",
    "HE Capabilities",
]
for column in string_columns:
    combined_df[column] = combined_df[column].astype(str)
Data Visualization¶
Data Distribution¶
In [ ]:
# Display the dataframe after the dtype conversions (Jupyter shows a truncated
# head/tail preview for large frames).
combined_df
Out[ ]:
| Timestamp | MAC Address | Channel | DS Channel | HT Capabilities | Extended Capabilities | Vendor Specific Tags | SSID | Supported Rates | Extended Supported Rates | VHT Capabilities | HE Capabilities | Length | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-05-20 13:52:01.864465952 | d2:6b:aa:b5:fb:ed | 1 | 1.0 | 6 | 17 | -1 | -1 | 2 | 0 | -1 | 62 | 135 | iPhone12Pro_C |
| 1 | 2023-05-20 13:52:01.884716034 | d2:6b:aa:b5:fb:ed | 1 | 1.0 | 6 | 17 | -1 | -1 | 2 | 0 | -1 | 62 | 135 | iPhone12Pro_C |
| 2 | 2023-05-20 13:52:01.910542011 | d2:6b:aa:b5:fb:ed | 6 | 6.0 | 6 | 17 | -1 | -1 | 2 | 0 | -1 | 62 | 135 | iPhone12Pro_C |
| 3 | 2023-05-20 13:52:01.930788994 | d2:6b:aa:b5:fb:ed | 6 | 6.0 | 6 | 17 | -1 | -1 | 2 | 0 | -1 | 62 | 135 | iPhone12Pro_C |
| 4 | 2023-05-20 13:52:01.968745947 | d2:6b:aa:b5:fb:ed | 11 | 11.0 | 6 | 17 | -1 | -1 | 2 | 0 | -1 | 62 | 135 | iPhone12Pro_C |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13939 | 2021-07-07 12:02:57.579541922 | da:a1:19:00:17:f9 | 6 | 1.0 | 8 | 7 | 25 | -1 | 0 | 0 | 0 | 13 | 182 | XiaomiRedmiNote7_S |
| 13940 | 2021-07-07 11:32:04.533828019 | da:a1:19:1a:cc:8f | 6 | 8.0 | 8 | 7 | 25 | Wind3 HUB-6D1619 | 0 | 0 | 0 | 14 | 198 | XiaomiRedmiNote7_S |
| 13941 | 2021-07-07 11:46:50.089955091 | da:a1:19:41:c9:b1 | 11 | 5.0 | 8 | 7 | 25 | -1 | 0 | 0 | 0 | 32 | 143 | XiaomiRedmiNote7_S |
| 13942 | 2021-07-07 12:16:31.309731960 | da:a1:19:c7:24:b1 | 1 | 3.0 | 8 | 7 | 25 | -1 | 0 | 0 | 0 | 14 | 182 | XiaomiRedmiNote7_S |
| 13943 | 2021-07-07 11:19:07.014556885 | da:a1:19:05:11:80 | 1 | 5.0 | 8 | 7 | 25 | Wind3 HUB-6D1619 | 0 | 0 | 0 | 14 | 198 | XiaomiRedmiNote7_S |
36046 rows × 14 columns
Number of Probe Request entries per device
In [ ]:
# Plot how the rows are distributed over the values of "Label" (the device
# name column). Rendering is delegated to plotHelper.plot_label_distribution.
plotHelper.plot_label_distribution(combined_df, "Label")
Packet Length¶
In [ ]:
# Heatmap over device "Label" and packet "Length". Axis assignment and any
# aggregation are decided inside plotHelper.plot_heatmap (not shown here).
plotHelper.plot_heatmap(combined_df, "Label", "Length")
In [ ]:
# Box plot over "Label" and "Length"; presumably one box of packet lengths
# per device label — confirm in plotHelper.plot_boxplot.
plotHelper.plot_boxplot(combined_df, "Label", "Length")
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/scripts/plotHelper.py:170: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
SSIDs¶
Percentage of SSIDs disclosed in Probe Requests.
In [ ]:
# Pie chart of the "SSID" column values; undisclosed SSIDs appear as the
# placeholder value "-1" in this dataset.
plotHelper.plot_pie_chart(combined_df, "SSID")
In [ ]:
# Heatmap over device "Label" and "SSID"; axis assignment is decided inside
# plotHelper.plot_heatmap (not shown here).
plotHelper.plot_heatmap(combined_df, "Label", "SSID")
In [ ]:
# Share of devices (distinct "Label" values) that disclosed an SSID in at
# least one row. Rows whose SSID is the string "-1" count as not disclosing.
# Both counts are computed once and reused for the percentage.
disclosing_devices = combined_df.loc[combined_df["SSID"] != "-1", "Label"].nunique()
total_devices = combined_df["Label"].nunique()
print(
    disclosing_devices,
    "devices disclosing SSID out of",
    total_devices,
    "->",
    round(disclosing_devices / total_devices * 100, 2),
    "%",
)
7 devices disclosing SSID out of 33 -> 21.21 %
Channel Utilization¶
Channel usage per device
In [ ]:
# Heatmap over device "Label" and the "Channel" column; axis assignment is
# decided inside plotHelper.plot_heatmap (not shown here).
plotHelper.plot_heatmap(combined_df, "Label", "Channel")
DS Channel usage per device
In [ ]:
# Heatmap over device "Label" and the "DS Channel" column (stored as float64
# in this dataframe).
plotHelper.plot_heatmap(combined_df, "Label", "DS Channel")
DS Channel parameter distribution on actual Channel frequencies.
In [ ]:
# Heatmap pairing the "Channel" column with the "DS Channel" column, to see
# how the two values relate to each other.
plotHelper.plot_heatmap(combined_df, "Channel", "DS Channel")
Plotting Channel usage per single device
In [ ]:
# Multiple pie charts over "Label" and "Channel"; presumably one chart per
# device label — confirm in plotHelper.plot_multi_pie_charts.
plotHelper.plot_multi_pie_charts(combined_df, "Label", "Channel")
In [ ]:
# Multiple pie charts over "Label" and "DS Channel"; presumably one chart per
# device label — confirm in plotHelper.plot_multi_pie_charts.
plotHelper.plot_multi_pie_charts(combined_df, "Label", "DS Channel")
In [ ]:
# Box plot over "Label" and "Channel"; presumably one box of channel values
# per device label — confirm in plotHelper.plot_boxplot.
plotHelper.plot_boxplot(combined_df, "Label", "Channel")
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/scripts/plotHelper.py:170: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
In [ ]:
# Box plot over "Label" and "DS Channel"; presumably one box of DS Channel
# values per device label — confirm in plotHelper.plot_boxplot.
plotHelper.plot_boxplot(combined_df, "Label", "DS Channel")
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/scripts/plotHelper.py:170: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
Information Elements¶
HT Capabilities¶
In [ ]:
# Heatmap over device "Label" and the encoded "HT Capabilities" values
# (cast to str earlier in the notebook).
plotHelper.plot_heatmap(combined_df, "Label", "HT Capabilities")
In [ ]:
# Pie chart of the "HT Capabilities" values.
# NOTE(review): other_percentage=0.01 presumably groups values below a 1%
# share into a single "other" slice — confirm in plotHelper.plot_pie_chart.
plotHelper.plot_pie_chart(combined_df, "HT Capabilities", other_percentage=0.01)
HE Capabilities¶
In [ ]:
# Heatmap over device "Label" and the encoded "HE Capabilities" values
# (cast to str earlier in the notebook).
plotHelper.plot_heatmap(combined_df, "Label", "HE Capabilities")
In [ ]:
# Pie chart of the "HE Capabilities" values, using the helper's default
# other_percentage.
plotHelper.plot_pie_chart(combined_df, "HE Capabilities")
Supported Rates¶
In [ ]:
# Heatmap over device "Label" and the encoded "Supported Rates" values
# (cast to str earlier in the notebook).
plotHelper.plot_heatmap(combined_df, "Label", "Supported Rates")
In [ ]:
# Pie chart of the "Supported Rates" values.
# NOTE(review): other_percentage=0.01 presumably groups values below a 1%
# share into a single "other" slice — confirm in plotHelper.plot_pie_chart.
plotHelper.plot_pie_chart(combined_df, "Supported Rates", other_percentage=0.01)
Extended Supported Rates¶
In [ ]:
# Heatmap over device "Label" and the encoded "Extended Supported Rates"
# values (cast to str earlier in the notebook).
plotHelper.plot_heatmap(combined_df, "Label", "Extended Supported Rates")
In [ ]:
# Pie chart of the "Extended Supported Rates" values, using the helper's
# default other_percentage.
plotHelper.plot_pie_chart(combined_df, "Extended Supported Rates")
Vendor Specific Tags¶
In [ ]:
# Heatmap over the encoded "Vendor Specific Tags" values and device "Label".
# NOTE(review): the column arguments are in the opposite order from every
# other per-device heatmap in this notebook, which pass "Label" first. Confirm
# whether the transposed layout is intentional or a slip.
plotHelper.plot_heatmap(combined_df, "Vendor Specific Tags", "Label")
In [ ]:
# Pie chart of the "Vendor Specific Tags" values.
# NOTE(review): other_percentage=0.03 presumably groups values below a 3%
# share into a single "other" slice — confirm in plotHelper.plot_pie_chart.
plotHelper.plot_pie_chart(combined_df, "Vendor Specific Tags", other_percentage=0.03)
Extended Capabilities¶
In [ ]:
# Heatmap over device "Label" and the encoded "Extended Capabilities" values
# (cast to str earlier in the notebook).
plotHelper.plot_heatmap(combined_df, "Label", "Extended Capabilities")
In [ ]:
# Pie chart of the "Extended Capabilities" values.
# NOTE(review): other_percentage=0.03 presumably groups values below a 3%
# share into a single "other" slice — confirm in plotHelper.plot_pie_chart.
plotHelper.plot_pie_chart(combined_df, "Extended Capabilities", other_percentage=0.03)
VHT Capabilities¶
In [ ]:
# Heatmap over device "Label" and the encoded "VHT Capabilities" values
# (cast to str earlier in the notebook).
plotHelper.plot_heatmap(combined_df, "Label", "VHT Capabilities")
In [ ]:
# Pie chart of the "VHT Capabilities" values, using the helper's default
# other_percentage.
plotHelper.plot_pie_chart(combined_df, "VHT Capabilities")